Read prepared data.
subscriptions <- read_rds('../data/subscriptions.rds')
summary(subscriptions)
customerid subscriptionid revenuecurr billingcurrency startmonth
Min. : 10006 Min. : 154 Min. : 0.8 DKK :668540 Min. :2003-12-01
1st Qu.: 1090111 1st Qu.: 5121661 1st Qu.: 15.0 EUR :532574 1st Qu.:2012-12-01
Median : 5319576 Median :14784414 Median : 63.2 USD :505039 Median :2014-12-01
Mean : 8044651 Mean :14612438 Mean : 1382.8 NOK :232550 Mean :2014-05-02
3rd Qu.:14410806 3rd Qu.:24044332 3rd Qu.: 174.0 SEK :159059 3rd Qu.:2016-07-01
Max. :23917123 Max. :29424356 Max. :2823000.0 GBP : 96217 Max. :2018-03-01
(Other): 89185
endmonth months status num_previous_months num_previous_subs
Min. :2004-03-01 Min. : 1.000 active:1840623 Min. : 0.00 Min. : 0.000
1st Qu.:2013-04-01 1st Qu.: 1.000 churn : 442541 1st Qu.: 1.00 1st Qu.: 1.000
Median :2015-03-01 Median : 3.000 Median : 8.00 Median : 3.000
Mean :2014-08-18 Mean : 3.532 Mean : 15.34 Mean : 6.028
3rd Qu.:2016-11-01 3rd Qu.: 3.000 3rd Qu.: 22.00 3rd Qu.: 8.000
Max. :2020-03-01 Max. :24.000 Max. :162.00 Max. :68.000
num_previous_months_binned firstpaiddate channelcat paymentperiodchosenatstart currency
Min. : 0.00 Min. :2003-12-15 paid :1518216 Min. :-1.000 DKK :668679
1st Qu.: 1.00 1st Qu.:2010-12-12 viral: 764948 1st Qu.: 1.000 USD :543314
Median : 8.00 Median :2013-06-23 Median : 3.000 EUR :537318
Mean :14.72 Mean :2013-01-16 Mean : 4.237 NOK :232433
3rd Qu.:26.00 3rd Qu.:2015-08-19 3rd Qu.: 3.000 SEK :158858
Max. :39.00 Max. :2018-03-26 Max. :24.000 GBP : 93847
(Other): 48715
marketname siteverkey firstpaidmonth firstdevice segment isquickpurchase
DK :669017 US :898770 Min. :2003-12-01 desktop: 175519 business: 407581 Min. :0.0000
NO :232881 DK :637715 1st Qu.:2010-12-01 mobile : 70157 other : 97467 1st Qu.:0.0000
US :188723 NO :207482 Median :2013-06-01 NA's :2037488 personal: 293518 Median :1.0000
SE :159905 SE :144173 Mean :2013-01-02 NA's :1484598 Mean :0.5282
FR :146174 FR : 96298 3rd Qu.:2015-08-01 3rd Qu.:1.0000
(Other):886057 NL : 79701 Max. :2018-03-01 Max. :1.0000
NA's : 407 (Other):219025 NA's :306
productversion isfreemium model31224 market_category
v_3 : 898821 Min. :0.0000 pre-changes :1894566 DK :669017
v_4 : 138959 1st Qu.:0.0000 3-12m-v1-2015-11-30 : 145756 LowGeneric:265151
v_older:1245078 Median :0.0000 3-12-24m-2016-08-25 : 144551 NO :232881
NA's : 306 Mean :0.2547 12m-v2-2015-05-16 : 71866 US :188723
3rd Qu.:1.0000 ex-subscriptionplanpaywallexpand: 8305 SE :159905
Max. :1.0000 ex-uglyjerry12mswitch : 5388 FR :146174
NA's :306 (Other) : 12732 (Other) :621313
siteverkey_cat siteverkey_cat2 chosen_subs_length gdppercapita gdppercapita_scaled
ORG:1384394 MUT:1311010 1 : 56935 Min. : 218.3 Min. :-2.4437
SS : 898770 ORG: 402042 -1 : 11140 1st Qu.: 42013.3 1st Qu.:-0.4694
SS : 570112 12 : 51360 Median : 55670.9 Median : 0.1758
24 : 6897 Mean : 51950.1 Mean : 0.0000
3 : 204319 3rd Qu.: 60637.3 3rd Qu.: 0.4104
gen:1952513 Max. :108422.5 Max. : 2.6676
subscription_summary subscription_summary_no_market
mc-DK_ssc-MUT_ac-39_m-3_ccsl-gen : 109608 ssc-MUT_ac-26_m-3_ccsl-gen: 203834
mc-DK_ssc-MUT_ac-26_m-3_ccsl-gen : 77645 ssc-MUT_ac-39_m-3_ccsl-gen: 170628
mc-LowGeneric_ssc-SS_ac-0_m-1_ccsl-3: 60652 ssc-SS_ac-0_m-1_ccsl-3 : 161585
mc-DK_ssc-ORG_ac-0_m-3_ccsl-gen : 58390 ssc-ORG_ac-0_m-3_ccsl-gen : 132393
mc-DK_ssc-MUT_ac-38_m-3_ccsl-gen : 51997 ssc-MUT_ac-8_m-3_ccsl-gen : 120482
mc-DK_ssc-ORG_ac-3_m-3_ccsl-gen : 41933 ssc-MUT_ac-38_m-3_ccsl-gen: 111133
(Other) :1882939 (Other) :1383109
subscriptions_with_target <- subscriptions %>%
# restrict to a recent expiry window
filter(endmonth >= begin_train_window & endmonth < end_window) %>%
mutate(num_previous_months_binned_fct = as.factor(num_previous_months_binned)) %>%
mutate(set_type = as.factor(if_else(endmonth >= begin_validation_window, 'validation', 'training'))) %>%
mutate(churnind = ifelse(status == 'churn', 1, 0))
Prepare churntable that we want to predict.
churntable <- subscriptions_with_target %>%
group_by(set_type, siteverkey_cat2, market_category, months, num_previous_months_binned, chosen_subs_length, subscription_summary_no_market) %>%
summarise(num_obs = n(),
churned = sum(churnind)) %>%
group_by(set_type) %>%
mutate(churn_rate = churned / num_obs,
renew_rate = 1 - churn_rate,
month_churn = 1 - renew_rate ^ (1/as.double(months)),
log_month_churn = log(month_churn),
weight = num_obs / sum(num_obs))
# NB! Does this introduce a bad bias ????
churntable_no_zeros <- churntable %>%
filter(churn_rate > 0)
Train model
new_model=glm(log_month_churn ~ market_category + subscription_summary_no_market, data=churntable_no_zeros[churntable_no_zeros$set_type == 'training', ], weights = weight)
Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:
prediction_table <- validation(subscriptions_with_target, new_model, predict_2fct_model)
validation_plots(prediction_table)
NAs introduced by coercion
Try simple logistic model
model_logit <- glm(churnind ~ market_category + siteverkey_cat2 + num_previous_months_binned + months + chosen_subs_length,
data = subscriptions_with_target[subscriptions_with_target$set_type == 'training',], family = 'binomial')
Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:
prediction_table_logit <- validation(subscriptions_with_target, model_logit)
validation_plots(prediction_table_logit)
NAs introduced by coercion